"""Fit a linear regression of sales on advertising spend and report results.

The script reads ``Advertising.csv`` from the current working directory,
plots each advertising channel against ``sales``, fits a scikit-learn
``LinearRegression`` on a train split, prints the fitted parameters and the
predictions for the test split, reports the RMSE, and finally plots the
predicted versus actual test values.
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # `sklearn.cross_validation` was superseded by `sklearn.model_selection`
    # and is absent from current scikit-learn releases; fall back to it only
    # on old installs that predate `model_selection`.
    from sklearn.cross_validation import train_test_split

# 1. Load the data (the CSV path is relative, so show where we are running).
print(os.getcwd())
data = pd.read_csv('Advertising.csv')
print(data.head())

# Feature columns used as regressors, and the target column.
feature_cols = ['TV', 'radio', 'newspaper']
X = data[feature_cols]
y = data['sales']

# Scatter each advertising channel against sales, one subplot per channel.
channel_styles = [('TV', 'ro'), ('radio', 'g^'), ('newspaper', 'b*')]
plt.figure(figsize=(9, 12))
for position, (channel, style) in enumerate(channel_styles, start=1):
    plt.subplot(3, 1, position)
    plt.plot(data[channel], y, style)
    plt.title(channel)
    plt.grid()
plt.tight_layout()
plt.show()

# Inspect the feature frame: preview, type and shape.
print(X.head())
print(type(X))
print(X.shape)

# Inspect the target series.
print(y.head())

# Build the training and test sets (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print("-----------shape:-------------")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Fit scikit-learn's linear regression on the training split.
linreg = LinearRegression()
model = linreg.fit(X_train, y_train)
print("-----------model:-------------")
print(model)
print("-----------linreg.intercept_:-------------")
print(linreg.intercept_)
print("-----------linreg.coef_:-------------")
print(linreg.coef_)

# Pair the feature names with the coefficients. The pairing must be
# materialised and printed, otherwise the zip object is silently discarded.
print(list(zip(feature_cols, linreg.coef_)))

# Predict on the held-out test split.
y_pred = linreg.predict(X_test)
print("-----------y_pred-------------")
print(y_pred)
print(type(y_pred))

# Evaluation metric for the regression problem.
print("-----------回归问题的评价测度-------------")
print(type(y_pred), type(y_test))
print(len(y_pred), len(y_test))
print(y_pred.shape, y_test.shape)

# RMSE computed directly from its definition. `y_test.values` is used so the
# subtraction is positional rather than aligned on the shuffled Series index.
squared_errors = (y_pred - y_test.values) ** 2
print("RMSE by hand:", np.sqrt(squared_errors.mean()))

# Plot predicted against actual values for the test split.
sample_index = range(len(y_pred))
plt.figure()
plt.plot(sample_index, y_pred, 'b', label="predict")
plt.plot(sample_index, y_test.values, 'r', label="test")
plt.legend(loc="upper right")  # show the line labels in the figure
plt.show()